#!/usr/bin/env bash
# Train the gdmoe_wmt_en_de mixture-of-experts translation model on binarized WMT17 En-De data.
DATA_DIR="fairseq/data-bin/wmt17_en_de"
SAVE_DIR="./ckpt"


# Make the local fairseq and unilm checkouts importable; adjust these paths to your environment.
export PYTHONPATH=$PYTHONPATH:"/home/ubuntu/ssk/MoEResearch/MoEc_model/fairseq"
export PYTHONPATH=$PYTHONPATH:"/home/ubuntu/ssk/MoEResearch/MoEc_model/unilm"
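
# Make sure the checkpoint/log directory exists before training starts.
mkdir -p "$SAVE_DIR"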

# Launch training; checkpoints, tensorboard events, and the text log all go to $SAVE_DIR.
python train.py "$DATA_DIR" \
        --save-dir "$SAVE_DIR" \
        --tensorboard-logdir "$SAVE_DIR" \
        --log-format simple \
        --log-file "$SAVE_DIR/train.log" \
        --arch gdmoe_wmt_en_de \
        --encoder-normalize-before \
        --task translation \
        --truncate-source \
        --max-source-positions 256 \
        --max-target-positions 256 \
        --criterion label_smoothed_cross_entropy_moe \
        --label-smoothing 0.1 \
        --optimizer adam \
        --adam-betas '(0.9, 0.98)' \
        --adam-eps 1e-06 \
        --lr-scheduler inverse_sqrt \
        --lr 5e-04 \
        --warmup-init-lr 1e-07 \
        --stop-min-lr 1e-09 \
        --warmup-updates 250 \
        --max-update 32000 \
        --attention-dropout 0.1 \
        --dropout 0.3 \
        --max-tokens 4096 \
        --update-freq 16 \
        --seed 1 \
        --skip-invalid-size-inputs-valid-test \
        --fp16 \
        --fp16-no-flatten-grads \
        --ddp-backend no_c10d \
        --token-shuffle \
        --moe-gate-loss-wt 0.01 \
        --moe-gate-loss-combine-method sum \
        --no-epoch-checkpoints \
        --clip-norm 0.1 \
        --encoder-moe-layers 3 \
        --decoder-moe-layers 3 \
        --moe-top1-expert \
        --moe-sublayers 3 \
        --moe-expert-count 64 \
        --moe-gating-use-fp32 \
        --tmoe-routing-dim-reduction \
        --tmoe-routing-dim 32 \
        --tmoe-routing-hard-cosine \
        --moe-activation-dropout 0.0 \
        --moe-dropout 0.0 \
        --capacity-factor 2 \
        --sharded-save \
        --group-num 8 \
        --exp-level-drop 0.5 \
        --dropout-interval 250 \
        --var-coef 1.0 \
        --coef-type 1